home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Skunkware 5
/
Skunkware 5.iso
/
src
/
X11
/
wais
/
waisgate
/
irverify.c
< prev
next >
Wrap
C/C++ Source or Header
|
1995-05-09
|
15KB
|
535 lines
/* WIDE AREA INFORMATION SERVER SOFTWARE:
No guarantees or restrictions. See the readme file for the full standard
disclaimer.
Brewster@think.com
*/
/* Change log:
* $Log: irverify.c,v $
* Revision 1.6 92/04/01 09:57:36 morris
* fixed an eof check in readPostings
*
* Revision 1.5 92/03/28 19:48:10 jonathan
* Fixed Log header.
*
* Revision 1.4 92/02/18 15:36:32 morris
* made it faster
*
* Revision 1.3 92/02/12 13:32:33 jonathan
* Added $Log so RCS will put the log message in the header
*
*/
#include "irverify.h"
#include "irfiles.h"
#include "panic.h"
#include "futil.h"
#define TEST_READ false
/*---------------------------------------------------------------------------*/
void
printIndex (db)
database* db;
/* Iterate over the inverted index file of db from its first block to the
   end of the file, printing the postings of every term that is read.
   If the index file cannot be opened a message is written to stderr and
   nothing is printed.
 */
{
  serialPostingFile* spf = NULL;
  char indexFileName[MAX_FILE_NAME_LEN + 1];
  postingsForATerm* posts = NULL;

  spf = initSerialPostingFile(index_filename(indexFileName,db));
  if (spf == NULL)
    { /* initSerialPostingFile returns NULL when the file can't be opened;
         every call below would dereference it */
      fprintf(stderr,"could not open the index file\n");
      return;
    }

  while ((posts = getPostingsForNextTerm(spf)) != NULL)
    { printPostingsForATerm(posts);
      /* XXX the postings are not disposed of here, so they leak */
    }

  disposeSerialPostingFile(spf);
}
/*---------------------------------------------------------------------------*/
static void
print_dictionary_block_and_index _AP((unsigned char* block,long size,serialPostingFile* spf));

static void
print_dictionary_block_and_index(block,size,spf)
unsigned char *block;
long size;
serialPostingFile* spf;
/* Print the entries of one dictionary block and, for each entry, the
   postings found in spf at the position recorded for that entry.
   block - the dictionary block to print
   size  - the maximum number of entries to look at
   spf   - the open posting file the entry positions refer to
   Printing stops at the first entry whose word is empty.
 */
{
  long i;
  postingsForATerm* posts = NULL;

  for(i = 0; i < size; i++)
    {
      char *word = dictionary_block_word(i, block);
      long pos = dictionary_block_position(i, block);

      if(word[0] == '\0')
        break;
      printf("Entry %3ld: %21s %7ld\n", i, word,pos);

      posts = getPostingsAt(spf,pos);
      if (posts == NULL)
        { /* getPostingsAt yields NULL when nothing could be read at pos;
             printing it would dereference a NULL pointer */
          fprintf(stderr,"no postings could be read at position %ld\n",pos);
          continue;
        }
      printPostingsForATerm(posts);
      /* XXX the postings are not disposed of here, so they leak */
    }
}
/*---------------------------------------------------------------------------*/
extern long number_of_dictionary_blocks;
extern unsigned char *dictionary_header_block;
extern unsigned char *dictionary_block;

void
printIndexUsingDictionary(db)
database* db;
/* Use the dictionary of db to walk over its inverted index.
   The number of dictionary blocks is read from the start of the
   dictionary stream, the header block is read and printed, and then every
   dictionary block named by the header is read and printed together with
   the postings of each of its entries.
   Updates the globals number_of_dictionary_blocks,
   dictionary_header_block and dictionary_block, and leaves the dictionary
   stream positioned at its end.
   Failures (index file or dictionary stream not open, unreadable blocks)
   are reported through panic.
 */
{
  FILE *dictStream = db->dictionary_stream;
  long i;
  long new_number_of_dictionary_blocks;
  serialPostingFile* spf = NULL;
  char indexFileName[MAX_FILE_NAME_LEN + 1];

  spf = initSerialPostingFile(index_filename(indexFileName,db));
  if(NULL == spf)
    /* initSerialPostingFile returns NULL when the file can't be opened;
       the posting lookups below would dereference it */
    panic("could not open the index file");

  if(NULL == dictStream)
    panic("dictionary dictStream is not open");

  s_fseek(dictStream, 0L, SEEK_SET);
  new_number_of_dictionary_blocks = read_bytes(DICTIONARY_HEADER_SIZE, dictStream);
  /* NOTE(review): the previous header buffer is dropped without being
     freed when the block count grows - confirm who owns that buffer */
  if(new_number_of_dictionary_blocks > number_of_dictionary_blocks)
    dictionary_header_block = NULL;
  number_of_dictionary_blocks = new_number_of_dictionary_blocks;
  printf("Number of dictionary blocks %ld\n", number_of_dictionary_blocks);

  if(NULL == (dictionary_header_block =
              read_dictionary_block(dictionary_header_block,
                                    DICTIONARY_HEADER_SIZE,
                                    number_of_dictionary_blocks,
                                    dictStream)))
    panic("Could not read dictionary header block");

  printf("The Dictionary Header Block:\n");
  print_dictionary_block(dictionary_header_block, number_of_dictionary_blocks);

  for(i = 0; i < number_of_dictionary_blocks; i++)
    {
      long pos = dictionary_block_position(i, dictionary_header_block);

      if(NULL == (dictionary_block =
                  read_dictionary_block(dictionary_block,
                                        pos, DICTIONARY_BLOCK_SIZE, dictStream)))
        panic("Could not read dictionary block %ld", pos);
      printf("\n\nDictionary block %ld (position %ld):\n", i, pos);
      print_dictionary_block_and_index(dictionary_block, DICTIONARY_BLOCK_SIZE,spf);
    }

  fseek(dictStream, 0L, SEEK_END);
  disposeSerialPostingFile(spf);
}
/*---------------------------------------------------------------------------*/
serialPostingFile*
initSerialPostingFile(filename)
char* filename;
/* Open the inverted index file named filename for binary reading and
   return a newly allocated serialPostingFile that keeps track of it.
   The stream is sought to offset INDEX_HEADER_SIZE, and the returned
   structure records the stream, the value of file_length for it, and
   INDEX_HEADER_SIZE as the current index block.
   Returns NULL if the file can't be opened.  The result is released with
   disposeSerialPostingFile.
 */
{
  serialPostingFile* state;
  FILE* in = s_fopen(filename,"rb");

  if (in == NULL)
    return(NULL);               /* can't open that file */

  s_fseek(in,INDEX_HEADER_SIZE,SEEK_SET);

  state = (serialPostingFile*)s_malloc((size_t)sizeof(serialPostingFile));
  state->stream = in;
  state->length = file_length(in);
  state->current_index_block = INDEX_HEADER_SIZE;

  return(state);
}
/*---------------------------------------------------------------------------*/
void
disposeSerialPostingFile(pf)
serialPostingFile* pf;
/* Close the stream held by pf and free pf itself.
   A NULL pf (what initSerialPostingFile returns when the file could not
   be opened) is ignored.
 */
{
  if (pf == NULL)
    return;
  s_fclose(pf->stream);
  s_free(pf);
}
/*---------------------------------------------------------------------------*/
void
printPostingsForATerm(pfat)
postingsForATerm* pfat;
/* Print pfat to stdout: a "word" line when the word is not empty,
   followed by one "doc ... weight ..." line for each of its entries.
 */
{
  long n = 0;

  if (pfat->word[0] != '\0')
    printf("word '%s'\n",pfat->word);

  while (n < pfat->entries)
    { printf("\tdoc %ld weight %ld\n",pfat->docs[n],pfat->weights[n]);
      n++;
    }
}
/*---------------------------------------------------------------------------*/
postingsForATerm*
getPostingsAt(spf,position)
serialPostingFile* spf;
long position;
/* Seek the posting file to position and return the postings of the term
   that is read from there (see getPostingsForNextTerm).
   position must be the start of a block in the index file; this is not
   verified.
   Returns NULL if the seek fails (a message is written to stderr) or if
   getPostingsForNextTerm returns NULL.
 */
{
  if (0 != fseek(spf->stream,position,SEEK_SET))
    { /* reading on after a failed seek would return postings that belong
         to some other position */
      fprintf(stderr,
              "fseek failed into the inverted file to position %ld\n",
              position);
      return(NULL);
    }
  spf->current_index_block = position;
  return(getPostingsForNextTerm(spf));
}
/*---------------------------------------------------------------------------*/
void
disposePostingsForATerm(pfat)
postingsForATerm* pfat;
{
s_free(pfat->docs);
s_free(pfat->weights);
s_free(pfat);
}
/*---------------------------------------------------------------------------*/
void
removePostings(pfat,start,run)
postingsForATerm* pfat;
long start;
long run;
/* Remove the run postings at indices start .. start + run - 1 from pfat.
   The entries that follow the removed range are moved down in both the
   docs and the weights arrays and pfat->entries is reduced by run.  The
   arrays themselves are not reallocated.
   A negative start or run, or a range that reaches past the last entry,
   is an error: pfat is left untouched.
 */
{
  long total = (long)pfat->entries;
  long tail;            /* number of entries that follow the removed range */

  if (start < 0 || run < 0 || start > total || run > total - start)
    return; /* this is an error */

  tail = total - start - run;

  /* docs and weights are typed pointers, so offsets are counted in
     elements; only the byte count handed to memmove is scaled by the
     element size of the array being moved */
  memmove(pfat->docs + start,
          pfat->docs + start + run,
          (size_t)tail * sizeof(docID));
  memmove(pfat->weights + start,
          pfat->weights + start + run,
          (size_t)tail * sizeof(postingWeight));

  pfat->entries -= run;
}
/*---------------------------------------------------------------------------*/
void
readPostings(spf,posts,not_full_flag)
serialPostingFile* spf;
postingsForATerm* posts;
long not_full_flag;
/* Read the postings of one index block from spf->stream into posts.
   The stream must be positioned just after the block's flag; the flag
   itself has already been read by the caller and is passed in as
   not_full_flag.
   For a block that is not full the number of entries is the next-block
   field divided by INDEX_ELEMENT_SIZE; for a full block it is the block
   size less INDEX_BLOCK_HEADER_SIZE, divided by INDEX_ELEMENT_SIZE.
   posts->docs and posts->weights are replaced by newly allocated arrays
   and posts->entries is incremented once for every posting read, so the
   caller must have set posts->entries to 0 beforehand.
   When reading fails a message is written to stderr and the routine
   returns early: docs and weights are NULL if nothing was allocated yet,
   otherwise they hold the postings read so far.
 */
{
  long count;
  long number_of_valid_entries = 0;
  long index_block;
  long index_block_size;

  /* make sure the arrays are never left holding stale values on the
     early error returns below */
  posts->docs = NULL;
  posts->weights = NULL;

  index_block = read_bytes(NEXT_INDEX_BLOCK_SIZE,spf->stream);
  index_block_size = read_bytes(INDEX_BLOCK_SIZE_SIZE,spf->stream);
  if (EOF == index_block_size)
    { fprintf(stderr,"reading from the index file failed\n");
      return;
    }

  if (not_full_flag == INDEX_BLOCK_NOT_FULL_FLAG) /* not full */
    number_of_valid_entries = index_block / INDEX_ELEMENT_SIZE;
  else if (not_full_flag == INDEX_BLOCK_FULL_FLAG) /* full */
    { number_of_valid_entries =
        (index_block_size - INDEX_BLOCK_HEADER_SIZE) / INDEX_ELEMENT_SIZE;
    }
  else
    { /* bad news,file is corrupted. this should return error
         code rather than panicing XXX */
      panic("Expected the flag in the inverted file to be valid. it is %lx",
            not_full_flag);
    }

  if (number_of_valid_entries < 0)
    { /* a negative count would turn into an enormous size_t in the
         allocations below */
      fprintf(stderr,"the index block claims a negative number of entries\n");
      return;
    }

  posts->docs =
    (docID*)s_malloc((size_t)(sizeof(docID) * number_of_valid_entries));
  posts->weights =
    (postingWeight*)s_malloc((size_t)(sizeof(postingWeight) *
                                      number_of_valid_entries));

  for (count = 0; count < number_of_valid_entries; count++)
    { long val;

      posts->docs[count] = read_bytes(DOCUMENT_ID_SIZE,spf->stream);
      /* the word position and character position fields are not used */
      s_fseek(spf->stream,WORD_POSITION_SIZE+CHARACTER_POSITION_SIZE,SEEK_CUR);
      val = read_bytes(WEIGHT_SIZE,spf->stream);
      if(EOF == val)
        { fprintf(stderr,"reading from the inverted file failed\n");
          return;
        }
      else
        { posts->weights[count] = val;
          posts->entries++;
        }
    }
}
/*---------------------------------------------------------------------------*/
long
readDictionaryIndexBlock(number_of_occurances,word,stream)
long *number_of_occurances;
char *word;
FILE *stream;
/* NOTE - similar to read_dictionary_index_block */
{
  /* this reads the dictionary index block from the index stream.
     It assumes the stream is positioned right after the flag.
     The next-block and block-size fields are skipped, the number of
     occurances is stored through number_of_occurances, and the word that
     follows (one line, with its newline removed) is copied into word.
     word is assumed to have room for MAX_WORD_LENGTH + 1 characters.
     returns 0 if it succeeds.
     returns -1 if the word can't be read (end of file or read error).
     returns -2 if it read something strange (a word longer than
     MAX_WORD_LENGTH).
     Always sets word length to 0 if it fails. */
  char temp[MAX_WORD_LENGTH + 2];
  size_t len;

  word[0] = '\0';
  s_fseek(stream,NEXT_INDEX_BLOCK_SIZE+INDEX_BLOCK_SIZE_SIZE,SEEK_CUR);
  *number_of_occurances = read_bytes(NUMBER_OF_OCCURANCES_SIZE,stream);

  /* 2 is for the \n and '\0'.  When fgets fails temp holds nothing
     usable, so it must not be inspected */
  if (fgets(temp,MAX_WORD_LENGTH + 2,stream) == NULL)
    return(-1);

  /* trim the \n; an empty string has no last character to look at */
  len = strlen(temp);
  if (len > 0 && temp[len - 1] == '\n')
    { len--;
      temp[len] = '\0';
    }

  /* no newline within MAX_WORD_LENGTH characters: copying this would
     write more than MAX_WORD_LENGTH + 1 bytes into word */
  if (len > MAX_WORD_LENGTH)
    return(-2);

  strcpy(word, temp);
  return(0);
}
/*---------------------------------------------------------------------------*/
postingsForATerm*
getPostingsForNextTerm(spf)
serialPostingFile* spf;
{
postingsForATerm* posts = NULL;
posts = (postingsForATerm*)s_malloc((size_t)sizeof(postingsForATerm));
posts->word[0] = '\0';
posts->entries = 0;
/* this is really a 2 step process - read the dictonary block, then read
the postings. I don't see any reason to unwrap it though */
while (true)
{
long flag = read_bytes(INDEX_BLOCK_FLAG_SIZE,spf->stream);
if (flag == EOF)
{ return(NULL);
}
if (flag == INDEX_BLOCK_DICTIONARY_FLAG) /* the dictionary entry */
{ /* read the dictionary part */
long number_of_occurances;
if (readDictionaryIndexBlock(&number_of_occurances,
posts->word,spf->stream) < 0)
panic("read dictionary index block failed");
}
else /* the posting entry */
{ readPostings(spf,posts,flag);
break;
}
}
return(posts);
}
/*---------------------------------------------------------------------------*/
#ifdef old
these routines are slower than the current ones, keep them around for a
while until we are sure the new ones work ok
/*---------------------------------------------------------------------------*/
/* Legacy version, only compiled when `old' is defined.
   Returns whatever getPostingsForNextTermFAST returns for spf.  The
   commented-out code ran the SLOW variant on the same file position and
   printed both results so they could be compared.
   NOTE(review): getPostingsForNextTermFAST is not defined in the part of
   the file visible here, and `slow' is only used by the commented-out
   code. */
postingsForATerm*
getPostingsForNextTerm(spf)
serialPostingFile* spf;
{
postingsForATerm* slow;
postingsForATerm* fast;
/*
long filePos = s_ftell(spf->stream);
slow = getPostingsForNextTermSLOW(spf);
printf("SLOW:\n");
printPostingsForATerm(slow);NL();
s_fseek(spf->stream,filePos,SEEK_SET);
*/
fast = getPostingsForNextTermFAST(spf);
/* printf("FAST:\n");
printPostingsForATerm(fast);NL();
disposePostingsForATerm(slow);
*/
return(fast);
}
/*---------------------------------------------------------------------------*/
/* Legacy version, only compiled when `old' is defined.
   Starting at spf->current_index_block, reads blocks until one posting
   block (full or not full) has been handled, and returns a newly
   allocated postingsForATerm.  A dictionary block sets the word of the
   result.  Returns NULL when current_index_block is at or past the
   recorded file length.  After every block current_index_block is
   advanced by the block size and the stream is sought there. */
postingsForATerm*
getPostingsForNextTermSLOW(spf)
serialPostingFile* spf;
{
postingsForATerm* posts = NULL;
boolean keepGoing = true;
if (spf->current_index_block >= spf->length)
return(NULL);
posts = (postingsForATerm*)s_malloc((size_t)sizeof(postingsForATerm));
posts->word[0] = '\0';
posts->entries = 0;
while (keepGoing)
{
/* block header: flag, next-block field, block size */
long flag = read_bytes(INDEX_BLOCK_FLAG_SIZE,spf->stream);
long next_index_block = read_bytes(NEXT_INDEX_BLOCK_SIZE,spf->stream);
long index_block_size = read_bytes(INDEX_BLOCK_SIZE_SIZE,spf->stream);
if (flag == INDEX_BLOCK_DICTIONARY_FLAG)
{ long last_index_block;
/* NOTE(review): this declaration shadows the index_block_size read
   above for the rest of this branch */
long index_block_size;
long number_of_occurances;
char word[MAX_WORD_LENGTH + 1];
if (0 > read_dictionary_index_block(spf->current_index_block,
&last_index_block,
&index_block_size,
&number_of_occurances,
word,
spf->stream))
panic("read dictionary index block failed");
cprintf(TEST_READ,
"%ld: size %3ld word '%s',occurances %ld last block %ld\n",
spf->current_index_block,index_block_size,word,
number_of_occurances,next_index_block);
strcpy(posts->word,word);
}
else if (flag == INDEX_BLOCK_NOT_FULL_FLAG)
{ cprintf(TEST_READ,"%ld: size %3ld Not full,valid entries %ld\n",
spf->current_index_block,index_block_size,next_index_block);
/* two-argument call: matches the legacy readPostings in this
   #ifdef section, which seeks back to current_index_block itself */
readPostings(spf,posts);
keepGoing = false;
}
else if (flag == INDEX_BLOCK_FULL_FLAG)
{ cprintf(TEST_READ,"%ld: size %3ld full block,next block %ld\n",
spf->current_index_block,index_block_size,next_index_block);
readPostings(spf,posts);
keepGoing = false;
}
else
panic("bad entry %ld (ftell %ld),flag was %ld",
spf->current_index_block,ftell(spf->stream),flag);
/* step to the block that follows the one just handled */
spf->current_index_block += index_block_size;
s_fseek(spf->stream,spf->current_index_block,SEEK_SET);
}
return(posts);
}
/*---------------------------------------------------------------------------*/
/* Legacy version, only compiled when `old' is defined.
   Seeks to spf->current_index_block, reads the block header again (flag,
   next-block field, block size) and appends every posting of the block
   to posts, growing posts->docs and posts->weights by one element per
   posting with s_realloc.  Here number_of_valid_entries is a byte count:
   the loop advances by INDEX_ELEMENT_SIZE per posting.
   Read and seek failures are reported on stderr and end the routine
   early; an unknown flag is reported through panic. */
void
readPostings(spf,posts)
serialPostingFile* spf;
postingsForATerm* posts;
{
long not_full_flag = INDEX_BLOCK_FULL_FLAG;
long count,index_block_size;
long document_id,weight,number_of_valid_entries;
long index_block = spf->current_index_block;
if (index_block >= 0)
{
/* read the index block */
if (0 != fseek(spf->stream,(long)index_block,SEEK_SET))
{
fprintf(stderr,
"fseek failed into the inverted file to position %ld\n",
(long)index_block);
return;
}
not_full_flag = read_bytes(INDEX_BLOCK_FLAG_SIZE,spf->stream);
/* from here on index_block holds the next-block field, not the
   position of the block being read */
index_block = read_bytes(NEXT_INDEX_BLOCK_SIZE,spf->stream);
index_block_size = read_bytes(INDEX_BLOCK_SIZE_SIZE,spf->stream);
if (EOF == index_block_size)
{ fprintf(stderr,"reading from the index file failed\n");
return;
}
if (not_full_flag == INDEX_BLOCK_NOT_FULL_FLAG)
{ /* not full */
number_of_valid_entries = index_block;
}
else if (not_full_flag == INDEX_BLOCK_FULL_FLAG)
{ /* full */
number_of_valid_entries = index_block_size - INDEX_BLOCK_HEADER_SIZE;
}
else
{ /* bad news,file is corrupted. this should return error
code rather than panicing XXX */
panic("Expected the flag in the inverted file to be valid. it is %ld",
not_full_flag);
}
cprintf(TEST_READ," number of valid bytes: %ld\n",
number_of_valid_entries);
for (count = 0; count < number_of_valid_entries;
count = count + INDEX_ELEMENT_SIZE)
{
document_id = read_bytes(DOCUMENT_ID_SIZE,spf->stream);
/* word position and character position are read and discarded */
(void)read_bytes(WORD_POSITION_SIZE,spf->stream);
(void)read_bytes(CHARACTER_POSITION_SIZE,spf->stream);
weight = read_bytes(WEIGHT_SIZE,spf->stream);
cprintf(TEST_READ," entry %ld,Doc_id: %ld,weight %ld\n",
count % INDEX_ELEMENT_SIZE,document_id,weight);
if(EOF == weight)
{ fprintf(stderr,"reading from the doc-id table failed\n");
return;
}
/* grow both arrays by one element and store the new posting last */
posts->entries++;
posts->docs = (docID*)s_realloc(posts->docs,
(size_t)(sizeof(docID) * posts->entries));
posts->docs[posts->entries - 1] = document_id;
posts->weights = (postingWeight*)s_realloc(posts->weights,
(size_t)(sizeof(postingWeight) *
posts->entries));
posts->weights[posts->entries - 1] = weight;
}
}
}
/*---------------------------------------------------------------------------*/
#endif /* def old */